#-------------------------------------------------------------------------------

# Text analysis on UNSC, UNGA resolutions and vetoes
# Coding dates: June 2022-October 2023
# Coded by: Andrea Knapp (andrea.knapp2@unibo.it)

#-------------------------------------------------------------------------------

# Sections:
# 1) Load before the analysis
# 2) Creation of data-frame matrix
# 3) Select number of k
# 4) STM with different k from 3 to 15
# 5) Select number of k
# 6) Chi-squared tests
# 7) 6k STM model with covariates
# 8) Word cloud
# 9) Exclusivity and semantic coherence validation
# 10) Exclusivity and semantic coherence validation
# 11) Most representative documents for topics
# 12) Estimating effects
# 13) Visual representation of breakpoints
# 14) Replication analysis with two different corpora (UNSC and UNGA)

# analysis was conducted with RStudio version 2023.12.0+369 running in R version 4.3.2

#-------------------------------------------------------------------------------

# 1) Load before the analysis

# install packages
# ATTENTION: Skip this section if packages are already installed
install.packages("devtools")
install.packages("readtext")
install.packages("quanteda")
install.packages("gfortran")
install.packages("ggplot2")
install.packages("tidyverse")
install.packages("tidytext")
install.packages("dplyr")
devtools::install_github("matthewjdenny/preText", force = TRUE)
install.packages("quanteda.textstats")
install.packages("stm")
install.packages("topicmodels")
install.packages("ldatuning")
install.packages("quanteda.textplots")
install.packages("tm")
install.packages("reshape2")
install.packages("wordcloud")
install.packages("pals")
install.packages("SnowballC")
install.packages("lda")
install.packages("TTR")
install.packages("strucchange")
install.packages("forecast")
install.packages("tseries")
install.packages("ggpubr")
install.packages("stminsights")
devtools::install_github("mikajoh/tidystm", dependencies = TRUE, force = TRUE)
install.packages("scales")
install.packages("countrycode")
install.packages("DT")
install.packages("mapproj")
install.packages("sf")
install.packages("RgoogleMaps")
install.packages("rworldmap")
install.packages("maps")
install.packages("rnaturalearth") 
install.packages("rnaturalearthdata")
install.packages("ggspatial")
install.packages("stminsights")
install.packages("leaflet")
install.packages("tmap")
install.packages("here")
install.packages("wesanderson")
install.packages("here")
install.packages("flextable")

# load packages
rm(list = ls())
library(devtools)
library(readtext)
library(quanteda)
library(ggplot2)
library(tidyverse)
library(tidytext)
library(dplyr)
library(preText)
library(quanteda.textstats)
library(stm)
library(topicmodels)
library(ldatuning)
library(quanteda.textplots)
library(tm)
library(reshape2)
library(wordcloud)
library(pals)
library(SnowballC)
library(lda)
library(TTR)
library(strucchange)
library(forecast)
library(tseries)
library(ggpubr)
library(stminsights)
library(tidystm)
library(scales)
library(countrycode)
library(DT)
library(mapproj)
library(sf)
library(RgoogleMaps)
library(rworldmap)
library(maps)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggspatial)
library(stminsights)
library(leaflet)
library(tmap)
library(wesanderson)
library(here)
library(flextable)

# set working directory
setwd("...") # replace with own working directory
load("data_complete.rda")
load("corp_tot.rda")
load("token_tot.rda")

#-------------------------------------------------------------------------------

# 2) Creation of data-frame matrix and pre-processing

dfm_tot <- dfm(token_tot)
nfeat(dfm_tot)  # 11073
dfm_tot <- dfm_trim(dfm_tot, min_termfreq = 0.9,
                    termfreq_type = "quantile") 
dfm_tot <- dfm_trim(dfm_tot,
                    max_termfreq = 5000, 
                    min_termfreq = 100) 
nfeat(dfm_tot) # 1071

# check distribution of documents over years
docutab <- table(data_complete$Year)
proptab <- prop.table(table(data_complete$Year))

# check distribution of documents over countries
docutab2 <- table(data_complete$Country)
proptab2 <- prop.table(table(data_complete$Country))

# plot
par(mfrow = c(1, 2)) 
plot(docutab, main = "Distribution of document frequency over years", 
     ylab = "Frequency", xlab = "Years")
plot(docutab2, main = "Distribution of document frequency over countries", 
     ylab = "Frequency", xlab = "Countries")
par(mfrow = c(1, 1)) 


#-------------------------------------------------------------------------------

# 3) Select number of k

# convert to stm object
dfm_searchK <- convert(dfm_tot, to = "stm")

# select k between 3 and 50
# ATTENTION: Te following lines (169-170) take multiple hours to run. Instead of running the model,
# the data can be loaded using load("storage.rda")
storage <- searchK(dfm_searchK$documents, 
                   dfm_searchK$vocab,
                   K = c(3:50),
                   data = meta.dat,
                   init.type = "Spectral",
                   max.em.its = 75)

# plot results
plot <- data.frame("K" = c(3:50), 
                   "Coherence" = unlist(storage$results$semcoh),
                   "Exclusivity" = unlist(storage$results$exclus))
plot <- melt(plot, id=c("K"))
ggplot(plot, aes(K, value, color = variable, linetype = variable)) +
  geom_line(size = 1, show.legend = F) +
  facet_wrap(~variable,scales = "free_y") + 
  scale_linetype_manual(values = c("solid", "dashed")) +
  labs(x = "Number of k", y = "Value") + theme_minimal()

# diagnostic values per number of topics
evaluation <- data.frame(storage$results)
evaluation <- data.frame(lapply(evaluation, function(x) as.numeric(x)))
ggplot(evaluation, aes(semcoh, exclus)) +
  geom_text(aes(label=K), size = 4) + color_palette("grey") +
  labs(x = "Semantic coherence", y = "Exclusivity") + theme_minimal()
plot.searchK(storage)


#-------------------------------------------------------------------------------

# 4) STM model with different k from 3 to 15

# 3 topics
stm3 <- stm(dfm_tot, K = 3, seed = 1234)
plot(stm3)
labelTopics(stm3, topics = c(1:3), n = 10)

# plot
stm3_topics <- tidy(stm3, matrix = "beta")
stm3_top_terms <- stm3_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top3 = stm3_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(y = "Beta", x = "Terms", 
                                        title = "Terms with highest beta for 3 K")
top3

###############

# 4 topics
stm4 <- stm(dfm_tot, K = 4, seed = 1234)
plot(stm4)
labelTopics(stm4, topics = c(1:4), n = 10)

# plot
stm4_topics <- tidy(stm4, matrix = "beta")
stm4_top_terms <- stm4_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top4 = stm4_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 4 K")
top4

###############

# 5 topics
stm5 <- stm(dfm_tot, K = 5, seed = 1234)
plot(stm5)
labelTopics(stm5, topics = c(1:5), n = 10)

# plot
stm5_topics <- tidy(stm5, matrix = "beta")
stm5_top_terms <- stm5_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top5 = stm5_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 5 K")
top5

###############

# 6 topics
stm6 <- stm(dfm_tot, K = 6, seed = 1234)
plot(stm6)
labelTopics(stm6, topics = c(1:6), n = 10)

# plot
stm6_topics <- tidy(stm6, matrix = "beta")
stm6_top_terms <- stm6_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top6 = stm6_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 6 K")
top6

###############

# 7 topics
stm7 <- stm(dfm_tot, K = 7, seed = 1234)
plot(stm7)
labelTopics(stm7, topics = c(1:7), n = 10)

# plot
stm7_topics <- tidy(stm7, matrix = "beta")
stm7_top_terms <- stm7_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top7 = stm7_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 7 K")
top7

###############

# 8 topics
stm8 <- stm(dfm_tot, K = 8, seed = 1234)
plot(stm8)
labelTopics(stm8, topics = c(1:8), n = 10)

# plot
stm8_topics <- tidy(stm8, matrix = "beta")
stm8_top_terms <- stm8_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top8 = stm8_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 8 K")
top8

###############

# 9 topics
stm9 <- stm(dfm_tot, K = 9, seed = 1234)
plot(stm9)
labelTopics(stm9, topics = c(1:9), n = 10)

# plot
stm9_topics <- tidy(stm9, matrix = "beta")
stm9_top_terms <- stm9_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top9 = stm9_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 9 K")
top9

###############

# 10 topics
stm10 <- stm(dfm_tot, K = 10, seed = 1234)
plot(stm10)
labelTopics(stm10, topics = c(1:10), n = 10)

# plot
stm10_topics <- tidy(stm10, matrix = "beta")
stm10_top_terms <- stm10_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top10 = stm10_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 10 K")
top10

###############

# 11 topics
stm11 <- stm(dfm_tot, K = 11, seed = 1234)
plot(stm11)
labelTopics(stm11, topics = c(1:11), n = 10)

# plot
stm11_topics <- tidy(stm11, matrix = "beta")
stm11_top_terms <- stm11_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top11 = stm11_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 11 K")
top11

###############

# 12 topics
stm12 <- stm(dfm_tot, K = 12, seed = 1234)
plot(stm12)
labelTopics(stm12, topics = c(1:12), n = 10)

# plot
stm12_topics <- tidy(stm12, matrix = "beta")
stm12_top_terms <- stm12_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top12 = stm12_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 12 K")
top12

###############

# 13 topics
stm13 <- stm(dfm_tot, K = 13, seed = 1234)
plot(stm13)
labelTopics(stm13, topics = c(1:13), n = 10)

# plot
stm13_topics <- tidy(stm13, matrix = "beta")
stm13_top_terms <- stm13_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top13 = stm13_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 13 K")
top13

###############

# 14 topics
stm14 <- stm(dfm_tot, K = 14, seed = 1234)
plot(stm14)
labelTopics(stm14, topics = c(1:14), n = 10)

# plot
stm14_topics <- tidy(stm14, matrix = "beta")
stm14_top_terms <- stm14_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top14 = stm14_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 14 K")
top14

###############

# 15 topics
stm15 <- stm(dfm_tot, K = 15, seed = 1234)
plot(stm15)
labelTopics(stm15, topics = c(1:15), n = 10)

# plot
stm15_topics <- tidy(stm15, matrix = "beta")
stm15_top_terms <- stm15_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top15 = stm15_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 15 K")
top15

###############

# 38 topics
stm38 <- stm(dfm_tot, K = 38, seed = 1234)
plot(stm38)
labelTopics(stm38, topics = c(1:38), n = 10)

# plot
stm38_topics <- tidy(stm38, matrix = "beta")
stm38_top_terms <- stm38_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top38 = stm38_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + labs(x = "Terms", y = "Beta", 
                                        title = "Terms with highest beta for 38 K")
top38

###############

# save built models
save(stm3, file = "stm3")
save(stm4, file = "stm4")
save(stm5, file = "stm5")
save(stm6, file = "stm6")
save(stm7, file = "stm7")
save(stm8, file = "stm8")
save(stm9, file = "stm9")
save(stm10, file = "stm10")
save(stm11, file = "stm11")
save(stm12, file = "stm12")
save(stm13, file = "stm13")
save(stm14, file = "stm14")
save(stm15, file = "stm15")
save(stm38, file = "stm38")


#-------------------------------------------------------------------------------

# 5) Chi-squared tests

# compare most used words by Assembly with Council
# group data
dfm_group <- dfm_group(dfm_tot, groups = Institution)

# calculate Chi2
keyness <- textstat_keyness(dfm_group, target = "A", measure = "chi2")

# visualize distribution
textplot_keyness(keyness, 
                 color = c("firebrick3", "lightblue"), n = 20) + 
  labs(x = "Frequency", y = "Association", 
       title = "Words most strongly associated to Assembly and Council") + 
  theme_minimal()

###############

# compare words used in resolutions and procès verbaux
# group data
dfm_group <- dfm_group(dfm_tot, groups = Type)

# calculate Chi2
keyness <- textstat_keyness(dfm_group, target = "RES", measure = "chi2")

# visualize distribution
textplot_keyness(keyness, 
                 color = c("firebrick3", "lightblue"), n = 20) + 
  labs(x = "Frequency", y = "Association",
       title = "Words most strongly associated to resolutions and procès verbaux") + 
  theme_minimal()


#-------------------------------------------------------------------------------

# 6) 6k STM model with covariates

# fit model with 6 topics and two covariates: countries and years
mod <- stm(dfm_tot, K = 6, prevalence = ~ Country + s(Date), seed = 1234)
plot(mod)
labelTopics(mod, topics = c(1:6), n = 20)

mod_topics <- tidy(mod, matrix = "beta")
mod_top_terms <- mod_topics %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, beta)
modgr <- mod_top_terms %>% mutate(term = reorder(term, beta))
colourPalette <- c("goldenrod2", "indianred2", "palegreen4", "steelblue",
                   "royalblue4", "plum3")
ggplot(modgr, aes(reorder(term, beta), beta, fill = factor(topic))) +
  geom_col(position = "dodge", width = 0.7, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + scale_fill_manual(values = colourPalette) +
  labs(x = "Terms", y = "Beta", title = "6k STM with covariates") 

# extract topic proportion from model
topic_props <- mod$theta
dim(topic_props) # documents by K matrix
head(topic_props, 10) # betas 

# betas as log probability
word_logprobs <- mod$beta[[1]][[1]]
dim(word_logprobs) # K by vocabulary size

# prevalence of words in topics
# topic 1
topic1_highest <- order(word_logprobs[1,], decreasing = TRUE)
mod$vocab[topic1_highest[1:20]]

# topic 2
topic2_highest <- order(word_logprobs[2,], decreasing = TRUE)
mod$vocab[topic2_highest[1:20]]

# topic 3
topic3_highest <- order(word_logprobs[3,], decreasing = TRUE)
mod$vocab[topic3_highest[1:20]]

# topic 4
topic4_highest <- order(word_logprobs[4,], decreasing = TRUE)
mod$vocab[topic4_highest[1:20]]

# topic 5
topic5_highest <- order(word_logprobs[5,], decreasing = TRUE)
mod$vocab[topic5_highest[1:20]]

# topic 6
topic6_highest <- order(word_logprobs[6,], decreasing = TRUE)
mod$vocab[topic6_highest[1:20]]

#-------------------------------------------------------------------------------

# 7) Word cloud

pal <- wes_palette("Darjeeling1", 50, type = "continuous")
par(mfrow=c(1,1))

# topic 1 
cloud(mod, topic = 1, type= "model",  max.words = 50, color = pal, scale = c(1, 0.5))

# topic 2 
cloud(mod, topic = 2, type= "model", max.words = 50, color = pal, scale = c(1, 0.25))

# topic 3 
cloud(mod, topic = 3, type= "model", max.words = 50, color = pal, scale = c(2, 0.25))

# topic 4 
cloud(mod, topic = 4, type= "model", max.words = 50, color = pal, scale = c(1, 0.25))

# topic 5 
cloud(mod, topic = 5, type= "model", max.words = 50, color = pal, scale = c(1, 0.25))

# topic 6
cloud(mod, topic = 6, type= "model", max.words = 50, color = pal, scale = c(1, 0.25))


#-------------------------------------------------------------------------------

# 8) Exclusivity and semantic coherence validation

# check exclusivity of topics
dotchart(exclusivity(mod), labels = 1:6, color = "black", bg = "black", 
         main = "Exclusivity of topics", xlab = "Exclusivity", ylab = "Topics")

# check semantic coherence (how often words co-occur in same document)
cohere <- semanticCoherence(mod, dfm_tot)
dotchart(cohere, labels = 1:6, color = "black", bg = "black", 
         main = "Semantic coherence of topics", xlab = "Semantic coherence", 
         ylab = "Topics")


#-------------------------------------------------------------------------------

# 9) Most representative documents for topics

# thoughts 1
set.seed(1234)
thoughts1 <- findThoughts(mod, texts = data_complete$text, 
                          n = 7, topics = 1)
thoughts1$index
# 1461 1379 1462 1380 1381 1463 1267
data_complete[1461, ]
data_complete[1379, ]
data_complete[1267, ]
# S_RES_2368_2017_2017-07-20_AFG
# S_RES_2253_2015_2015-12-17_AFG
# S_RES_2083_2012_2012-12-17_AFG

# thoughts 2
thoughts2 <- findThoughts(mod, texts = data_complete$text, 
                          n = 3, topics = 2)
thoughts2$index
# 1312 1292 1277
data_complete[1312, ]
data_complete[1292, ]
data_complete[1277, ]
# S_RES_2156_2014_2014-05-29_SDN
# S_RES_2126_2013_2013-11-25_SDN
# S_RES_2104_2013_2013-05-29_SDN

# thoughts 3
thoughts3 <- findThoughts(mod, texts = data_complete$text, 
                          n = 3, topics = 3)
thoughts3$index
# 311 778 445
data_complete[311, ]
data_complete[778, ]
data_complete[445, ]
# A_RES_55-228_2001_2001-07-18_IDN
# A_RES_72-289_2018_2018-07-05_SSD
# A_RES_59-13_2005_2005-06-22_IDN

# thoughts 4
thoughts4 <- findThoughts(mod, texts = data_complete$text, 
                          n = 5, topics = 4)
thoughts4$index
# 819  818  804 1698
data_complete[818:819, ]
data_complete[1698, ]
data_complete[804, ]
# S_RES_1019_1995_1995-11-09_BIH
# S_RES_1004_1995_1995-07-12_BIH
# S_RES_871_1993_1993-10-04_HRV

# thoughts 5
thoughts5 <- findThoughts(mod, texts = data_complete$text, 
                          n = 3, topics = 5)
thoughts5$index
# 543 514 567
data_complete[543, ]
data_complete[506, ]
data_complete[567, ]
# A_RES_62-181_2007_2007-12-19_ISR
# A_RES_61-117_2006_2006-12-14_ISR
# A_RES_63-201_2008_2008-12-19_ISR

# thoughts 6
thoughts6 <- findThoughts(mod, texts = data_complete$text, 
                          n = 7, topics = 6)
thoughts6$index
# 57 55 58 30 90 28 88
data_complete[55:58, ]
data_complete[30, ]
data_complete[90, ]
# A_RES_47-107_1992_1992-12-16_SOM/TCD/LBR
# A_RES_46-108_1991_1991-12-16_SOM
# A_RES_48-118_1993_1993-12-20_SOM


#-------------------------------------------------------------------------------

# 10) Estimating effects

effect <- estimateEffect(formula= ~ s(Date) + Country, stmobj = mod, 
                         metadata = data_complete)
summary(effect)

###############

# a)  topic proportions (for all topics) over years
effect.time.tidy <- extract.estimateEffect(x = effect, 
                                           covariate = "Date", 
                                           model = mod, 
                                           method = "continuous")
effect.time.tidy$date <- as.Date("1970-01-01") + effect.time.tidy$covariate.value
effect.time.tidy$label <- factor(effect.time.tidy$label, 
                                 levels=c("Topic 1", "Topic 2", "Topic 3", 
                                          "Topic 4", "Topic 5", "Topic 6"))
effect.time.tidy %>% 
  ggplot(aes(x = date, y = estimate, fill = as.factor(topic), 
             ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) +
  geom_ribbon(alpha = .6, show.legend = F) +
  geom_line(color = "black") +
  labs(x = "Year", y = "Expected Topic Proportion") +
  theme(text = element_text(size=8)) +
  theme(axis.text=element_text(size=9)) +
  theme(strip.text = element_text(size=12)) +
  scale_x_date(labels = date_format("%Y")) +
  ggtitle("Expected topic proportions over time") + theme_minimal() + 
  scale_fill_manual(values = colourPalette)

###############

# b) topic 2 (humanitarian language) vs. topic 4 (punitive dimension)
startTime <- as.Date("1990-03-30")
endTime <- as.Date("2019-01-01") 
start.end <- c(startTime,endTime)
colourPalette2 <- c("indianred2",  "steelblue")
effect.time.tidy %>% filter(topic == c(2, 4)) %>%
  ggplot(aes(x = date, y = estimate, fill = as.factor(topic),
             ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) + geom_ribbon(alpha = .6, show.legend = F) +
  geom_line(color = "black") +
  labs(x = "Year",
       y = "Expected Topic Proportion") +
  theme(text = element_text(size=8)) + scale_fill_manual(values = colourPalette2) +
  theme(axis.text=element_text(size=9)) +
  theme(strip.text = element_text(size=12)) +
  scale_x_date(limits=start.end, labels = date_format("%Y"), 
               breaks = date_breaks("1 year")) +
  ggtitle("Expected topic proportions over time") + theme_minimal() + 
  geom_hline(yintercept = 0, linetype = 2, color = "black") + 
  geom_vline(xintercept = 11406, linetype = 2, color = "red") + 
  geom_vline(xintercept = 13000, linetype = 2, color = "red") +
  geom_vline(xintercept = 14306, linetype = 2, color = "red")

###############

# c) check robustness of findings: two topics of interest for 6, 8, 10, 12k

par(mfrow = c(2, 2)) 

# 6k
mod6 <- stm(dfm_tot, K = 6, prevalence = ~ Country + s(Year), seed = 1234)
labelTopics(mod6, topics = c(1:6), n = 10)
effect6 <- estimateEffect(formula= ~ s(Year) + Country, stmobj = mod6, 
                          metadata = data_complete)
plot.estimateEffect(effect6, covariate = "Year", model = mod8, topics = c(2, 4) , 
                    method = "continuous", main = "6k",
                    ylim = c(-0.2, 0.9), xlim = c(1990, 2019), printlegend = F, 
                    ci.level = 0,
                    linecol = c("indianred2", "steelblue"), lwd = 50, lty = c(1, 3))
abline(h=0,lty=2,lwd=1,col="grey45")

# with 8k
mod8 <- stm(dfm_tot, K = 8, prevalence = ~ Country + s(Year), seed = 1234)
labelTopics(mod8, topics = c(1:8), n = 10)
effect8 <- estimateEffect(formula= ~ s(Year) + Country, stmobj = mod8, 
                          metadata = data_complete)
plot.estimateEffect(effect8, covariate = "Year", model = mod8, topics = c(2, 4), 
                    method = "continuous", main = "8k",
                    ylim = c(-0.2, 0.9), xlim = c(1990, 2019), printlegend = F, 
                    ci.level = 0,
                    linecol = c("indianred2", "steelblue"), lwd = 10)
abline(h=0,lty=2,lwd=1,col="grey45")

# with 10k
mod10 <- stm(dfm_tot, K = 10, prevalence = ~ Country + s(Year), seed = 1234)
labelTopics(mod10, topics = c(1:10), n = 10)
effect10 <- estimateEffect(formula= ~ s(Year) + Country, stmobj = mod10, 
                           metadata = data_complete)
plot.estimateEffect(effect10, covariate = "Year", model = mod10, topics = c(8, 4), 
                    method = "continuous",
                    main = "10k", ylim = c(-0.2, 0.9), xlim = c(1990, 2019), 
                    printlegend = F, ci.level = 0,
                    linecol = c("indianred2", "steelblue"), lwd = 10)
abline(h=0,lty=2,lwd=1,col="grey45")

# with 12k
mod12 <- stm(dfm_tot, K = 12, prevalence = ~ Country + s(Year), seed = 1234)
labelTopics(mod12, topics = c(1:12), n = 10)
effect12 <- estimateEffect(formula= ~ s(Year) + Country, stmobj = mod12, 
                           metadata = data_complete)
plot.estimateEffect(effect12, covariate = "Year", model = mod12, topics = c(11, 4), 
                    method = "continuous", 
                    main = "12k", ylim = c(-0.2, 0.9), xlim = c(1990, 2019), 
                    printlegend = F, ci.level = 0,
                    linecol = c("indianred2", "steelblue"), lwd = 10)
abline(h=0,lty=2,lwd=1,col="grey45")
par(mfrow = c(1, 1)) 


################

# d) over country
esteffects <- get_effects(estimates = effect,
                          variable = "Country",
                          type = "pointestimate", ci = 0.95)

# topic 1
top1 <- esteffects %>% filter(topic == 1) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 1") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 2
top2 <- esteffects %>% filter(topic == 2) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 2") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 3
top3 <- esteffects %>% filter(topic == 3) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 3") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 4
top4 <- esteffects %>% filter(topic == 4) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 4") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 5
top5 <- esteffects %>% filter(topic == 5) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 5") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 6
top6 <- esteffects %>% filter(topic == 6) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "Country", y = "Topic proportion", 
                                        title = "Topic 6") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")
ggarrange(top1, top2, top3, top4, top5, top6)

###############

# e) per region
data_complete$Region <- countrycode(sourcevar = data_complete[, "Country"],
                                    origin = "iso3c",
                                    destination = "region")
effect2 <- estimateEffect(formula= ~ s(Date) + Region, stmobj = mod, 
                          metadata = data_complete)
esteffects2 <- get_effects(estimates = effect2,
                           variable = "Region",
                           type = "pointestimate", ci = 0.95)

# topic 1
regtop1 <- esteffects2 %>% filter(topic == 1) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Topic 1") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 2
regtop2 <- esteffects2 %>% filter(topic == 2) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Ground protection") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 3
regtop3 <- esteffects2 %>% filter(topic == 3) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Topic 3") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 4
regtop4 <- esteffects2 %>% filter(topic == 4) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Political-legal protection") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 5
regtop5 <- esteffects2 %>% filter(topic == 5) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Topic 5") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

# topic 6
regtop6 <- esteffects2 %>% filter(topic == 6) %>%
  ggplot(aes(x = value, y = proportion)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.1, size = 1) +
  geom_point(size = 3) +
  coord_flip() + theme_minimal() + labs(x = "", y = "Topic proportion", 
                                        title = "Topic 6") +
  geom_hline(yintercept = 0, linetype = 2, color = "red")

ggarrange(regtop1, regtop2, regtop3, regtop4, regtop5, regtop6)
ggarrange(regtop2, regtop4)

################

# f) dominant topics per country

#aggregate per region
data_complete$Count <- countrycode(sourcevar = data_complete[, "Country"],
                                   origin = "iso3c",
                                   destination = "country.name")
effect3 <- estimateEffect(formula= ~ s(Date) + Count, stmobj = mod, 
                          metadata = data_complete)
summary(effect3)
esteffects3 <- get_effects(estimates = effect3,
                           variable = "Count",
                           type = "pointestimate", ci = 0.95)
options(stringsAsFactors = F)         
options("scipen" = 100, "digits" = 4) 
world <- ne_countries(scale = "medium", returnclass = "sf")

# label countries and dominant topics
# check dominant country through esteffects3
Country <- data.frame(Country = c("AFG", "AGO", "AZE", "BDI", "BGD", 
                                  "BIH", "CAF", "CIV", "COD", "COG", 
                                  "COL", "ERI", "ETH", "GEO", "GTM", 
                                  "HRV", "IDN", "IND", "IRN", "IRQ", 
                                  "ISR", "KEN", "LBN", "LBR", "LBY", 
                                  "MLI", "MMR", "MOZ", "NER", "PAK", 
                                  "PHL", "RUS", "RWA", "SDN", "SOM", 
                                  "SRB", "SSD", "SYR", "TCD", "UGA", 
                                  "UKR", "YEM", "ZAF"))
topics <- data.frame(topics = c(6, 4, 4, 4, 6, 
                                4, 2, 4, 4, 6, 
                                4, 4, 4, 4, 3, 
                                4, 3, 2, 4, 4, 
                                5, 4, 3, 6, 1,
                                3, 4, 6, 6, 2, 
                                6, 1, 4, 2, 1, 
                                4, 2, 4, 6, 4, 
                                4, 4, 6))

# create map
df <- data.frame(Country, topics)
visitedMap <- joinCountryData2Map(df, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topics", 
                            oceanCol = "azure2", catMethod = "categorical", 
                            missingCountryCol = gray(.8), 
                            colourPalette = c("goldenrod2", "indianred2", 
                                              "palegreen4", "steelblue",
                                              "royalblue4", "plum3"), 
                            addLegend = F, border = T)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = " ", horiz = TRUE, 
                             bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# only 2 and 4 taken from esteffects3
Country <- data.frame(Country = c("AFG", "AGO", "AZE", "BDI", "BGD", 
                                  "BIH", "CAF", "CIV", "COD", "COG", 
                                  "COL", "ERI", "ETH", "GEO", "GTM", 
                                  "HRV", "IDN", "IND", "IRN", "IRQ", 
                                  "ISR", "KEN", "LBN", "LBR", "LBY", 
                                  "MLI", "MMR", "MOZ", "NER", "PAK", 
                                  "PHL", "RUS", "RWA", "SDN", "SOM", 
                                  "SRB", "SSD", "SYR", "TCD", "UGA", 
                                  "UKR", "YEM", "ZAF"))
topics <- data.frame(topics = c("Topic 4", "Topic 4", "Topic 4", "Topic 4", 
                                "Topic 2", "Topic 4", "Topic 2", "Topic 4", 
                                "Topic 4", "Topic 2", "Topic 4", "Topic 4", 
                                "Topic 4", "Topic 4", "Topic 4", "Topic 4", 
                                "Topic 2", "Topic 2", "Topic 4", "Topic 4", 
                                "Topic 4", "Topic 4", "Topic 4", "Topic 2", 
                                "Topic 4", "Topic 2", "Topic 4", "Topic 4", 
                                "Topic 2", "Topic 2", "Topic 2", "Topic 4", 
                                "Topic 4", "Topic 2", "Topic 2", "Topic 4", 
                                "Topic 2", "Topic 4", "Topic 2", "Topic 4", 
                                "Topic 4", "Topic 4", "Topic 2"))

df <- data.frame(Country, topics)
par(mfrow = c(1, 1)) 
visitedMap <- joinCountryData2Map(df, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topics", 
                            oceanCol = "azure2", catMethod = "categorical", 
                            missingCountryCol = gray(.8),
                            colourPalette = c("indianred2", "steelblue"), 
                            addLegend = F, border = T)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = " ", horiz = TRUE, 
                             bg = "transparent",
                             bty = "n", cex = 1, pt.cex = 3))

################

# g) Measure of heterogeneity of topics across map from esteffects3

# topic 1
topic1 <- data.frame(topic1 = c(22, 11, 11, 3, 11, 
                                15, 4, 9, 8, 11, 
                                15, 6, 2, 7, 10,
                                13, 4, 22, 7, 22, 
                                0, 11, 5, 15, 43, 
                                6, 2, 11, 25, 18,
                                14, 64, 13, 1, 36, 
                                14, 0, 10, 10, 10, 
                                9, 21, 19))
dftopic1 <- data.frame(Country, topic1)
visitedMap <- joinCountryData2Map(dftopic1, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette2 <- brewer.pal(5,'Greens')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic1", 
                            oceanCol = "azure2", numCats = 5, 
                            catMethod = "pretty", missingCountryCol = gray(.8),
                            addLegend = F, 
                            mapTitle = "Measures of heterogeneity Topic 1", 
                            border = T, colourPalette = colourPalette2)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = TRUE, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# topic 2
topic2 <- data.frame(topic2 = c(13, 13, 12, 16, 12, 
                                11, 36, 22, 26, 10, 
                                27, 29, 10, 10, 18,
                                11, 18, 57, 0, 5, 
                                2, 25, 11, 17, 10, 
                                33, 7, 18, 10, 60,
                                13, 7, 10, 44, 21, 
                                11, 48, 0, 18, 15, 
                                0, 15, 14))
dftopic2 <- data.frame(Country, topic2)
visitedMap <- joinCountryData2Map(dftopic2, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette2 <- brewer.pal(5,'Reds')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic2", 
                            oceanCol = "azure2", numCats = 5, catMethod = "pretty", 
                            missingCountryCol = gray(.8), addLegend = F, 
                            mapTitle = "Measures of heterogeneity Topic 2", 
                            border = T, colourPalette = colourPalette2)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = TRUE, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# topic 3
Country <- data.frame(Country = c("AFG", "AGO", "AZE", "BDI", "BGD", 
                                  "BIH", "CAF", "CIV", "COD", "COG", 
                                  "COL", "ERI", "ETH", "GEO", "GTM", 
                                  "HRV", "IDN", "IND", "IRN", "IRQ", 
                                  "ISR", "KEN", "LBN", "LBR", "LBY", 
                                  "MLI", "MMR", "MOZ", "NER", "PAK", 
                                  "PHL", "RUS", "RWA", "SDN", "SOM", 
                                  "SRB", "SSD", "SYR", "TCD", "UGA", 
                                  "UKR", "YEM", "ZAF"))
topic3 <- data.frame(topic3 = c(4, 27, 3, 21, 3, 
                                12, 35, 27, 24, 7, 
                                3, 5, 32, 26, 36,
                                14, 50, 3, 2, 20, 
                                12, 2, 30, 20, 2, 
                                38, 1, 17, 5, 4,
                                23, 3, 26, 29, 7, 
                                16, 30, 18, 29, 25, 
                                4, 9, 16))
dftopic3 <- data.frame(Country, topic3)
visitedMap <- joinCountryData2Map(dftopic3, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette3 <- brewer.pal(5,'BuPu')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic3", 
                            oceanCol = "azure2", numCats = 5, catMethod = "pretty", 
                            missingCountryCol = gray(.8), addLegend = F, 
                            mapTitle = "Measures of heterogeneity Topic 3", 
                            border = T, colourPalette = colourPalette3)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = TRUE, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# topic 4
topic4 <- data.frame(topic4 = c(20, 46, 50, 45, 0, 
                                71, 18, 36, 35, 0, 
                                53, 45, 33, 47, 26,
                                69, 9, 2, 65, 44, 
                                6, 58, 23, 13, 33, 
                                18, 76, 23, 0, 2,
                                0, 24, 42, 15, 15, 
                                67, 22, 63, 12, 39, 
                                67, 49, 9))
dftopic4 <- data.frame(Country, topic4)
visitedMap <- joinCountryData2Map(dftopic4, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette4 <- brewer.pal(5,'Blues')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic4", 
                            oceanCol = "azure2", numCats = 6, catMethod = "pretty", 
                            missingCountryCol = gray(.8), addLegend = F, 
                            mapTitle = "Measure of heterogeneity Topic 4", 
                            border = T, colourPalette = colourPalette4)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = T, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# topic 5
topic5 <- data.frame(topic5 = c(2, 2, 21, 1, 0, 
                                2, 1, 0, 1, 2, 
                                1, 2, 0, 8, 4, 
                                3, 4, 2, 9, 2, 
                                70, 1, 24, 2, 2, 
                                1, 1, 1, 4, 2,
                                0, 3, 3, 1, 1, 
                                1, 1, 3, 2, 2, 
                                15, 2, 1))
dftopic5 <- data.frame(Country, topic5)
visitedMap <- joinCountryData2Map(dftopic5, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette5 <- brewer.pal(5,'RdPu')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic5", 
                            oceanCol = "azure2", numCats = 5, catMethod = "pretty", 
                            missingCountryCol = gray(.8), addLegend = F, 
                            mapTitle = "Measure of heterogeneity Topic 5", 
                            border = T, colourPalette = colourPalette5)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = TRUE, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

# topic 6
topic6 <- data.frame(topic6 = c(39, 1, 2, 14, 79, 
                                0, 6, 6, 7, 78, 
                                2, 13, 22, 1, 7,
                                0, 15, 15, 14, 7, 
                                9, 4, 6, 32, 10, 
                                6, 13, 30, 66, 15,
                                64, 0, 6, 9, 20, 
                                0, 4, 6, 29, 8, 
                                8, 6, 30))
dftopic6 <- data.frame(Country, topic6)
visitedMap <- joinCountryData2Map(dftopic6, joinCode = "ISO3", 
                                  nameJoinColumn = "Country")
colourPalette6 <- brewer.pal(5,'Oranges')
mapParams <- mapCountryData(visitedMap, nameColumnToPlot= "topic6", 
                            oceanCol = "azure2", numCats = 5, catMethod = "pretty", 
                            missingCountryCol = gray(.8), addLegend = F, 
                            mapTitle = "Measure of heterogeneity Topic 6", 
                            border = T, colourPalette = colourPalette6)
do.call(addMapLegendBoxes, c(mapParams, x = "bottom", title = "Proportion %", 
                             horiz = TRUE, bg = "transparent",
                             bty = "n", cex = 0.5, pt.cex = 3))

#-------------------------------------------------------------------------------

# 11) Visual representation of breakpoints

# breakpoint 1
startTime1 <- as.Date("2000-01-01")
endTime1 <- as.Date("2002-03-31")
ribon.start1 <- as.Date("2001-01-01")
ribon.end1 <- as.Date("2001-03-31")
start.end1 <- c(startTime1,endTime1)
start.end1

effect.time.tidy %>% filter(topic == 2) %>%
  ggplot(aes(x = date, y = estimate, fill = as.factor(topic),
             ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) + 
  geom_ribbon(alpha = .6, show.legend = F, fill = "gray") +
  geom_line(color = "red") +
  labs(x = "Time",
       y = "Expected topic proportion") +
  theme(text = element_text(size=8)) +
  theme(axis.text=element_text(size=9)) +
  theme(strip.text = element_text(size=12)) +
  scale_x_date(limits=start.end1, labels = date_format("%m/%Y"), 
               breaks = date_breaks("2 months")) +
  ggtitle("Expected breakpoint 1: Quarter 1 - 2001") + theme_minimal() + 
  geom_hline(yintercept = -0.08, linetype = 2, color = "black") +
  geom_rect(aes(xmin = ribon.start1, xmax = ribon.end1, ymin = -Inf, ymax = Inf), 
            fill = "green", alpha = 0.009)

# breakpoint 2
startTime2 <- as.Date("2004-10-01")
endTime2 <- as.Date("2006-12-31")
ribon.start2 <- as.Date("2005-10-01")
ribon.end2 <- as.Date("2005-12-31")
start.end2 <- c(startTime2,endTime2)
start.end2

effect.time.tidy %>% filter(topic == 2) %>%
  ggplot(aes(x = date, y = estimate, fill = as.factor(topic),
             ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) + 
  geom_ribbon(alpha = .6, show.legend = F, fill = "gray") +
  geom_line(color = "red") +
  labs(x = "Time",
       y = "Expected topic proportion") +
  theme(text = element_text(size=8)) +
  theme(axis.text=element_text(size=9)) +
  theme(strip.text = element_text(size=12)) +
  scale_x_date(limits=start.end2, labels = date_format("%m/%Y"), 
               breaks = date_breaks("2 months")) +
  ggtitle("Expected breakpoint 2: Quarter 4 - 2005") + theme_minimal() + 
  geom_hline(yintercept = 0.038, linetype = 2, color = "black") +
  geom_rect(aes(xmin = ribon.start2, xmax = ribon.end2, ymin = -Inf, ymax = Inf), 
            fill = "green", alpha = 0.007)

# breakpoint 3
startTime3 <- as.Date("2007-10-01")
endTime3 <- as.Date("2010-01-01")
ribon.start3 <- as.Date("2008-10-01")
ribon.end3 <- as.Date("2008-12-31")
start.end3 <- c(startTime3,endTime3)
start.end3

effect.time.tidy %>% filter(topic == 2) %>%
  ggplot(aes(x = date, y = estimate, fill = as.factor(topic),
             ymin = ci.lower, ymax = ci.upper)) +
  facet_wrap(~ label, nrow = 4) + 
  geom_ribbon(alpha = .6, show.legend = F, fill = "gray") +
  geom_line(color = "red") +
  labs(x = "Time",
       y = "Expected topic proportion") +
  theme(text = element_text(size=8)) +
  theme(axis.text=element_text(size=9)) +
  theme(strip.text = element_text(size=12)) +
  scale_x_date(limits=start.end3, labels = date_format("%m/%Y"), 
               breaks = date_breaks("2 months")) +
  ggtitle("Expected breakpoint 3: Quarter 4 - 2008") + theme_minimal() + 
  geom_hline(yintercept = 0.043, linetype = 2, color = "black") +
  geom_rect(aes(xmin = ribon.start3, xmax = ribon.end3, ymin = -Inf, ymax = Inf), 
            fill = "green", alpha = 0.007)

#-------------------------------------------------------------------------------

# 12) Replication analysis with two different corpora (UNSC and UNGA)

# only Council documents
subset_council <- dfm_tot[dfm_tot$Institution == "S", ]
mod_council <- stm(subset_council, K = 6, prevalence = ~ Country + s(Date), 
                   seed = 1234)
plot(mod_council)
labelTopics(mod_council, topics = c(1:6), n = 20)

mod_council_topics <- tidy(mod_council, matrix = "beta")
mod_council_top_terms <- mod_council_topics %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, beta)
mod_council <- mod_council_top_terms %>% mutate(term = reorder(term, beta))
ggplot(mod_council, aes(reorder(term, beta), beta, fill = factor(topic))) +
  geom_col(position = "dodge", width = 0.7, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + 
  labs(x = "Terms", y = "Beta", title = "6k STM Council with covariates") 


# only Assembly documents
subset_assembly <- dfm_tot[dfm_tot$Institution == "A", ]
mod_assembly <- stm(subset_assembly, K = 6, prevalence = ~ Country + s(Date), 
                    seed = 1234)
plot(mod_assembly)
labelTopics(mod_assembly, topics = c(1:6), n = 20)

mod_assembly_topics <- tidy(mod_assembly, matrix = "beta")
mod_assembly_top_terms <- mod_assembly_topics %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, beta)
mod_assembly <- mod_assembly_top_terms %>% mutate(term = reorder(term, beta))
ggplot(mod_assembly, aes(reorder(term, beta), beta, fill = factor(topic))) +
  geom_col(position = "dodge", width = 0.7, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + theme_minimal() + 
  labs(x = "Terms", y = "Beta", title = "6k STM Assembly with covariates") 
